*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

cd "$root"

* This do file calls Python geopandas to compute distances from block groups to prime locations
* Execution requires a Python working environment with packages mentioned at the beginning of the Python code
* Should the code not run through in Stata, the part between "python:" and "end" can be executed in Python
* In this case, you must manually set the root directory in line 26. 

python:

import os
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from math import radians, sin, cos, sqrt, atan2

# Set root directory to Stata's current working directory
root_directory = os.getcwd()

# File paths (relative to root_directory)
block_group_file = os.path.join(root_directory, r'_Data/US METRO DATA/BlockData/block_group_coordinates.csv')
pl_file = os.path.join(root_directory, r'_Work/Output/Data/US-CBSAs/PL-data.csv')
output_file = os.path.join(root_directory, r'_Work/TEMP/GISUS_using/BlockGroup2PLdist.csv')

# Haversine function
def haversine(lon1, lat1, lon2, lat2):
    R = 6371000  # Earth's radius in meters
    dlon = radians(lon2 - lon1)
    dlat = radians(lat2 - lat1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Load data
block_df = pd.read_csv(block_group_file)
pl_df = pd.read_csv(pl_file)

# Initialize lists for output
nearest_distances = []
nearest_pl_ids = []

# Total count for progress tracking
total_blocks = len(block_df)

# Loop through each block group and calculate distance to each prime location
for index, block in block_df.iterrows():
    block_lat, block_lon = block['intptlat10'], block['intptlon10']
    min_distance = float('inf')
    nearest_plid = None

    for _, pl in pl_df.iterrows():
        pl_lat, pl_lon = pl['PL_lat'], pl['PL_lon']
        distance = haversine(block_lon, block_lat, pl_lon, pl_lat)

        if distance < min_distance:
            min_distance = distance
            nearest_plid = pl['PLID_US']
    
    nearest_distances.append(min_distance)
    nearest_pl_ids.append(nearest_plid)

    # Print progress every 100 block groups processed
    if (index + 1) % 100 == 0 or (index + 1) == total_blocks:
        print(f"Processed {index + 1} out of {total_blocks} block groups...")

# Add results to block group DataFrame
block_df['DistPL'] = nearest_distances
block_df['plid_us'] = nearest_pl_ids

# Save the output
block_df[['geoid', 'DistPL', 'plid_us']].to_csv(output_file, index=False)

print(f"Output saved to: {output_file}")

end

* Script ends
